library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(geosphere)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following object is masked from 'package:base':
##
## date
# Load the master data set. stringsAsFactors is spelled out because its default
# changed from TRUE to FALSE in R 4.0.0; the str()/summary() output recorded
# for this script shows the text columns (station ids/names, usertype, the
# *_ATTRIBUTES flags, timestamps) as factors, which only happens with TRUE.
masterdata <- read.csv("new_MASTER_01_data.csv", stringsAsFactors = TRUE)
# Quick per-column overview of the raw import.
summary(masterdata)
## X tripduration start.station.id
## Min. : 1 Min. : 61.0 519 : 1558
## 1st Qu.: 51380 1st Qu.: 363.0 497 : 1227
## Median :102759 Median : 616.0 3255 : 1200
## Mean :102759 Mean : 992.7 285 : 1145
## 3rd Qu.:154138 3rd Qu.: 1081.0 402 : 1125
## Max. :205517 Max. :2678003.0 435 : 1089
## (Other):198173
## start.station.name start.station.latitude start.station.longitude
## Pershing Square North: 1558 Min. :40.66 Min. :-74.03
## E 17 St & Broadway : 1227 1st Qu.:40.72 1st Qu.:-74.00
## 8 Ave & W 31 St : 1200 Median :40.74 Median :-73.99
## Broadway & E 14 St : 1145 Mean :40.74 Mean :-73.98
## Broadway & E 22 St : 1125 3rd Qu.:40.76 3rd Qu.:-73.97
## W 21 St & 6 Ave : 1089 Max. :40.86 Max. :-73.89
## (Other) :198173
## end.station.id end.station.name end.station.latitude
## 519 : 1604 Pershing Square North: 1604 Min. :40.66
## 497 : 1254 E 17 St & Broadway : 1254 1st Qu.:40.72
## 402 : 1194 Broadway & E 22 St : 1194 Median :40.74
## 3255 : 1169 8 Ave & W 31 St : 1169 Mean :40.74
## 285 : 1157 Broadway & E 14 St : 1157 3rd Qu.:40.76
## 426 : 1120 West St & Chambers St: 1120 Max. :40.86
## (Other):198019 (Other) :198019
## end.station.longitude bikeid usertype birth.year
## Min. :-74.05 Min. :14529 Customer : 28805 Min. :1886
## 1st Qu.:-74.00 1st Qu.:25323 Subscriber:176712 1st Qu.:1969
## Median :-73.99 Median :30947 Median :1983
## Mean :-73.98 Mean :29669 Mean :1980
## 3rd Qu.:-73.97 3rd Qu.:35053 3rd Qu.:1990
## Max. :-73.89 Max. :42046 Max. :2003
##
## gender AWND AWND_ATTRIBUTES PRCP
## Min. :0.000 Min. : 1.120 : 22301 Min. :0.000
## 1st Qu.:1.000 1st Qu.: 2.910 ,,W:183216 1st Qu.:0.000
## Median :1.000 Median : 4.030 Median :0.000
## Mean :1.164 Mean : 4.385 Mean :0.106
## 3rd Qu.:1.000 3rd Qu.: 5.140 3rd Qu.:0.040
## Max. :2.000 Max. :12.750 Max. :1.830
## NA's :22301
## PRCP_ATTRIBUTES SNOW SNOW_ATTRIBUTES SNWD
## ,,W,2400 :186524 Min. :0.000 : 545 Min. :0.00000
## T,,W,2400: 18993 1st Qu.:0.000 ,,W,2400 :201841 1st Qu.:0.00000
## Median :0.000 T,,W,2400: 3131 Median :0.00000
## Mean :0.019 Mean :0.02829
## 3rd Qu.:0.000 3rd Qu.:0.00000
## Max. :4.000 Max. :3.90000
## NA's :545
## SNWD_ATTRIBUTES TAVG TAVG_ATTRIBUTES TMAX
## ,,W,2400 :204127 Mode:logical Mode:logical Min. :14.00
## T,,W,2400: 1390 NA's:205517 NA's:205517 1st Qu.:57.00
## Median :71.00
## Mean :68.17
## 3rd Qu.:81.00
## Max. :95.00
##
## TMAX_ATTRIBUTES TMIN TMIN_ATTRIBUTES WDF2
## ,,W:205517 Min. : 2.00 ,,W:205517 Min. : 10.0
## 1st Qu.:42.00 1st Qu.: 60.0
## Median :56.00 Median :220.0
## Mean :53.63 Mean :182.2
## 3rd Qu.:67.00 3rd Qu.:280.0
## Max. :82.00 Max. :360.0
## NA's :22301
## WDF2_ATTRIBUTES WDF5 WDF5_ATTRIBUTES WSF2
## : 22301 Min. : 10.0 : 22700 Min. : 6.90
## ,,W:183216 1st Qu.: 70.0 ,,W:182817 1st Qu.:10.10
## Median :220.0 Median :12.10
## Mean :183.6 Mean :12.81
## 3rd Qu.:270.0 3rd Qu.:15.00
## Max. :360.0 Max. :25.10
## NA's :22700 NA's :22301
## WSF2_ATTRIBUTES WSF5 WSF5_ATTRIBUTES WT01
## : 22301 Min. :11.00 : 22700 Min. :1
## ,,W:183216 1st Qu.:17.00 ,,W:182817 1st Qu.:1
## Median :19.90 Median :1
## Mean :20.77 Mean :1
## 3rd Qu.:23.00 3rd Qu.:1
## Max. :40.90 Max. :1
## NA's :22700 NA's :123167
## WT01_ATTRIBUTES WT02 WT02_ATTRIBUTES WT03
## :123167 Min. :1 :201690 Min. :1
## ,,W: 82350 1st Qu.:1 ,,W: 3827 1st Qu.:1
## Median :1 Median :1
## Mean :1 Mean :1
## 3rd Qu.:1 3rd Qu.:1
## Max. :1 Max. :1
## NA's :201690 NA's :186419
## WT03_ATTRIBUTES WT06 WT06_ATTRIBUTES WT08
## :186419 Min. :1 :204101 Min. :1
## ,,W: 19098 1st Qu.:1 ,,W: 1416 1st Qu.:1
## Median :1 Median :1
## Mean :1 Mean :1
## 3rd Qu.:1 3rd Qu.:1
## Max. :1 Max. :1
## NA's :204101 NA's :172801
## WT08_ATTRIBUTES newStartTime
## :172801 2019-03-01 17:41:27.7210: 2
## ,,W: 32716 2019-07-31 17:48:23.5580: 2
## 2019-01-01 00:35:03.5980: 1
## 2019-01-01 01:14:01.5150: 1
## 2019-01-01 01:59:10.1080: 1
## 2019-01-01 02:47:03.7040: 1
## (Other) :205509
## newStopTime
## 2019-05-28 09:10:01.3380: 2
## 2019-07-12 08:43:08.3900: 2
## 2019-01-01 00:38:10.6250: 1
## 2019-01-01 01:58:41.1290: 1
## 2019-01-01 02:12:34.9820: 1
## 2019-01-01 02:55:16.4380: 1
## (Other) :205509
# Column types of the raw import, before any recoding.
utils::str(masterdata)
## 'data.frame': 205517 obs. of 48 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ tripduration : int 110 1067 325 552 282 1150 178 777 423 144 ...
## $ start.station.id : Factor w/ 906 levels "116","119","120",..: 253 838 98 855 645 850 250 873 214 16 ...
## $ start.station.name : Factor w/ 908 levels "1 Ave & E 110 St",..: 868 208 409 199 747 717 785 126 155 205 ...
## $ start.station.latitude : num 40.8 40.8 40.8 40.7 40.7 ...
## $ start.station.longitude: num -74 -74 -74 -74 -74 ...
## $ end.station.id : Factor w/ 906 levels "116","119","120",..: 213 586 543 787 628 71 423 421 273 16 ...
## $ end.station.name : Factor w/ 908 levels "1 Ave & E 110 St",..: 863 200 390 193 356 677 643 398 289 207 ...
## $ end.station.latitude : num 40.8 40.7 40.8 40.7 40.7 ...
## $ end.station.longitude : num -74 -74 -74 -74 -74 ...
## $ bikeid : int 38891 38269 14654 15101 32868 30584 32492 30258 36783 36111 ...
## $ usertype : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 2 2 2 2 2 1 ...
## $ birth.year : int 1989 1965 1990 1977 1996 1988 1954 1989 1977 1961 ...
## $ gender : int 1 1 1 1 1 1 1 1 1 1 ...
## $ AWND : num 3.36 5.37 2.91 1.79 2.91 2.91 2.68 2.91 7.83 4.47 ...
## $ AWND_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ PRCP : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PRCP_ATTRIBUTES : Factor w/ 2 levels ",,W,2400","T,,W,2400": 2 2 1 2 1 1 1 1 1 1 ...
## $ SNOW : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SNOW_ATTRIBUTES : Factor w/ 3 levels "",",,W,2400",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ SNWD : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SNWD_ATTRIBUTES : Factor w/ 2 levels ",,W,2400","T,,W,2400": 1 1 1 1 1 1 1 1 1 1 ...
## $ TAVG : logi NA NA NA NA NA NA ...
## $ TAVG_ATTRIBUTES : logi NA NA NA NA NA NA ...
## $ TMAX : int 87 39 70 87 85 85 80 88 49 60 ...
## $ TMAX_ATTRIBUTES : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ TMIN : int 73 32 52 75 72 68 63 75 33 38 ...
## $ TMIN_ATTRIBUTES : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WDF2 : int 70 250 40 60 220 290 150 140 10 260 ...
## $ WDF2_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WDF5 : int 40 220 40 70 220 290 150 140 360 260 ...
## $ WDF5_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WSF2 : num 8.9 13 8.9 10.1 12.1 8.9 8.9 8.9 16.1 13 ...
## $ WSF2_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WSF5 : num 17 19.9 13 13 19 15 16.1 15 25.1 23 ...
## $ WSF5_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WT01 : int 1 NA NA NA NA NA NA NA NA NA ...
## $ WT01_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 1 1 1 1 1 1 1 1 1 ...
## $ WT02 : int NA NA NA NA NA NA NA NA NA NA ...
## $ WT02_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WT03 : int NA NA NA NA NA NA NA NA NA NA ...
## $ WT03_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WT06 : int NA NA NA NA NA NA NA NA NA NA ...
## $ WT06_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WT08 : int NA 1 NA NA NA NA 1 NA NA NA ...
## $ WT08_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 2 1 1 1 1 2 1 1 1 ...
## $ newStartTime : Factor w/ 205515 levels "2019-01-01 00:35:03.5980",..: 124947 197101 150074 92205 108911 100289 70041 126789 29559 28200 ...
## $ newStopTime : Factor w/ 205515 levels "2019-01-01 00:38:10.6250",..: 124930 197107 150056 92205 108899 100292 70035 126787 29556 28200 ...
# ---- Type clean-up ----
# bikeid is an identifier rather than a quantity, so store it as a factor.
masterdata$bikeid <- as.factor(masterdata$bikeid)

# Recode the numeric gender codes to labels in a single step. The levels are
# listed as 2, 1, 0 so the resulting level order (Female, Male, Unknown) is the
# same alphabetical order the previous nested ifelse() produced. A code other
# than 0, 1 or 2 now becomes NA instead of being silently labelled "Female".
masterdata$gender <- factor(
  masterdata$gender,
  levels = c(2, 1, 0),
  labels = c("Female", "Male", "Unknown")
)

# Drop columns that are not needed downstream. Assigning NULL to a column that
# is not present is a no-op, so these are safe whether or not the file has them.
masterdata$X <- NULL
masterdata$starttime <- NULL
masterdata$stoptime <- NULL

# Parse the timestamp strings in the session time zone. The format stops at
# whole seconds, so the fractional part of the raw strings is not parsed.
masterdata$newStartTime <- as.POSIXct(
  strptime(masterdata$newStartTime, "%Y-%m-%d %H:%M:%S")
)
masterdata$newStopTime <- as.POSIXct(
  strptime(masterdata$newStopTime, "%Y-%m-%d %H:%M:%S")
)

# Calendar date of each timestamp. as.Date() on a POSIXct converts through UTC
# unless told otherwise, which moved evening trips onto the following day
# (e.g. a 19:06 stop time was dated one day later). Formatting the POSIXct
# first keeps the date in the same zone the time was parsed in.
masterdata$newStartDate <- as.Date(format(masterdata$newStartTime, "%Y-%m-%d"))
masterdata$newStopDate <- as.Date(format(masterdata$newStopTime, "%Y-%m-%d"))
# ---- Trip distance ----
# Haversine (great-circle) distance between the start and end station of every
# trip. distHaversine() expects points as (longitude, latitude) pairs and
# returns distances in the unit of r, here the Earth radius in metres.
# The point matrices are built directly from the columns, so nothing depends
# on a hard-coded number of rows.
start_points <- cbind(
  as.numeric(masterdata$start.station.longitude),
  as.numeric(masterdata$start.station.latitude)
)
end_points <- cbind(
  masterdata$end.station.longitude,
  masterdata$end.station.latitude
)
masterdata$distanceH <- distHaversine(start_points, end_points, r = 6378137)
rm(start_points, end_points)

# ---- Speed ----
# tripduration is recorded in seconds (it equals newStopTime - newStartTime in
# the data), so it is converted to minutes before dividing; otherwise the
# column would hold metres per second despite its name.
masterdata$speedMetersperMin <-
  masterdata$distanceH / (masterdata$tripduration / 60)
# ---- Generation buckets ----
# Birth years are binned into half-open intervals [lower, upper):
#   < 1928 VeryOld | 1928-1945 Silent | 1946-1964 Boomer |
#   1965-1980 GenX | 1981-1999 Millennial | >= 2000 GenZ
# The outer factor() lists the levels youngest-first for the visualizations.
masterdata$agegroup <- factor(
  cut(
    masterdata$birth.year,
    breaks = c(-Inf, 1928, 1946, 1965, 1981, 2000, Inf),
    labels = c("VeryOld", "Silent", "Boomer", "GenX", "Millennial", "GenZ"),
    right = FALSE
  ),
  levels = c("GenZ", "Millennial", "GenX", "Boomer", "Silent", "VeryOld")
)

# ---- Month columns ----
# Numeric month of the start/stop date, plus a factor copy of each for
# faceting and grouping in the plots.
masterdata$startMonth <- month(masterdata$newStartDate)
masterdata$stopMonth <- month(masterdata$newStopDate)
masterdata$startMonthFactor <- as.factor(masterdata$startMonth)
masterdata$stopMonthFactor <- as.factor(masterdata$stopMonth)

# Column types after all derived columns have been added.
str(masterdata)
## 'data.frame': 205517 obs. of 56 variables:
## $ tripduration : int 110 1067 325 552 282 1150 178 777 423 144 ...
## $ start.station.id : Factor w/ 906 levels "116","119","120",..: 253 838 98 855 645 850 250 873 214 16 ...
## $ start.station.name : Factor w/ 908 levels "1 Ave & E 110 St",..: 868 208 409 199 747 717 785 126 155 205 ...
## $ start.station.latitude : num 40.8 40.8 40.8 40.7 40.7 ...
## $ start.station.longitude: num -74 -74 -74 -74 -74 ...
## $ end.station.id : Factor w/ 906 levels "116","119","120",..: 213 586 543 787 628 71 423 421 273 16 ...
## $ end.station.name : Factor w/ 908 levels "1 Ave & E 110 St",..: 863 200 390 193 356 677 643 398 289 207 ...
## $ end.station.latitude : num 40.8 40.7 40.8 40.7 40.7 ...
## $ end.station.longitude : num -74 -74 -74 -74 -74 ...
## $ bikeid : Factor w/ 19094 levels "14529","14530",..: 16053 15465 97 422 11882 9967 11539 9673 14577 14187 ...
## $ usertype : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 2 2 2 2 2 1 ...
## $ birth.year : int 1989 1965 1990 1977 1996 1988 1954 1989 1977 1961 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ AWND : num 3.36 5.37 2.91 1.79 2.91 2.91 2.68 2.91 7.83 4.47 ...
## $ AWND_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ PRCP : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PRCP_ATTRIBUTES : Factor w/ 2 levels ",,W,2400","T,,W,2400": 2 2 1 2 1 1 1 1 1 1 ...
## $ SNOW : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SNOW_ATTRIBUTES : Factor w/ 3 levels "",",,W,2400",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ SNWD : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SNWD_ATTRIBUTES : Factor w/ 2 levels ",,W,2400","T,,W,2400": 1 1 1 1 1 1 1 1 1 1 ...
## $ TAVG : logi NA NA NA NA NA NA ...
## $ TAVG_ATTRIBUTES : logi NA NA NA NA NA NA ...
## $ TMAX : int 87 39 70 87 85 85 80 88 49 60 ...
## $ TMAX_ATTRIBUTES : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ TMIN : int 73 32 52 75 72 68 63 75 33 38 ...
## $ TMIN_ATTRIBUTES : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WDF2 : int 70 250 40 60 220 290 150 140 10 260 ...
## $ WDF2_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WDF5 : int 40 220 40 70 220 290 150 140 360 260 ...
## $ WDF5_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WSF2 : num 8.9 13 8.9 10.1 12.1 8.9 8.9 8.9 16.1 13 ...
## $ WSF2_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WSF5 : num 17 19.9 13 13 19 15 16.1 15 25.1 23 ...
## $ WSF5_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WT01 : int 1 NA NA NA NA NA NA NA NA NA ...
## $ WT01_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 1 1 1 1 1 1 1 1 1 ...
## $ WT02 : int NA NA NA NA NA NA NA NA NA NA ...
## $ WT02_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WT03 : int NA NA NA NA NA NA NA NA NA NA ...
## $ WT03_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WT06 : int NA NA NA NA NA NA NA NA NA NA ...
## $ WT06_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WT08 : int NA 1 NA NA NA NA 1 NA NA NA ...
## $ WT08_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 2 1 1 1 1 2 1 1 1 ...
## $ newStartTime : POSIXct, format: "2019-08-17 13:10:36" "2019-12-04 18:48:45" ...
## $ newStopTime : POSIXct, format: "2019-08-17 13:12:27" "2019-12-04 19:06:32" ...
## $ newStartDate : Date, format: "2019-08-17" "2019-12-04" ...
## $ newStopDate : Date, format: "2019-08-17" "2019-12-05" ...
## $ distanceH : num 413 2567 922 657 1099 ...
## $ speedMetersperMin : num 3.75 2.41 2.84 1.19 3.9 ...
## $ agegroup : Factor w/ 6 levels "GenZ","Millennial",..: 2 3 2 3 2 2 4 2 3 4 ...
## $ startMonth : num 8 12 9 7 7 7 6 8 3 3 ...
## $ stopMonth : num 8 12 9 7 7 7 6 8 3 3 ...
## $ startMonthFactor : Factor w/ 12 levels "1","2","3","4",..: 8 12 9 7 7 7 6 8 3 3 ...
## $ stopMonthFactor : Factor w/ 12 levels "1","2","3","4",..: 8 12 9 7 7 7 6 8 3 3 ...
# Per-column overview after recoding and feature engineering.
base::summary(masterdata)
## tripduration start.station.id start.station.name
## Min. : 61.0 519 : 1558 Pershing Square North: 1558
## 1st Qu.: 363.0 497 : 1227 E 17 St & Broadway : 1227
## Median : 616.0 3255 : 1200 8 Ave & W 31 St : 1200
## Mean : 992.7 285 : 1145 Broadway & E 14 St : 1145
## 3rd Qu.: 1081.0 402 : 1125 Broadway & E 22 St : 1125
## Max. :2678003.0 435 : 1089 W 21 St & 6 Ave : 1089
## (Other):198173 (Other) :198173
## start.station.latitude start.station.longitude end.station.id
## Min. :40.66 Min. :-74.03 519 : 1604
## 1st Qu.:40.72 1st Qu.:-74.00 497 : 1254
## Median :40.74 Median :-73.99 402 : 1194
## Mean :40.74 Mean :-73.98 3255 : 1169
## 3rd Qu.:40.76 3rd Qu.:-73.97 285 : 1157
## Max. :40.86 Max. :-73.89 426 : 1120
## (Other):198019
## end.station.name end.station.latitude end.station.longitude
## Pershing Square North: 1604 Min. :40.66 Min. :-74.05
## E 17 St & Broadway : 1254 1st Qu.:40.72 1st Qu.:-74.00
## Broadway & E 22 St : 1194 Median :40.74 Median :-73.99
## 8 Ave & W 31 St : 1169 Mean :40.74 Mean :-73.98
## Broadway & E 14 St : 1157 3rd Qu.:40.76 3rd Qu.:-73.97
## West St & Chambers St: 1120 Max. :40.86 Max. :-73.89
## (Other) :198019
## bikeid usertype birth.year gender
## 35306 : 44 Customer : 28805 Min. :1886 Female : 49419
## 34019 : 41 Subscriber:176712 1st Qu.:1969 Male :140370
## 34958 : 41 Median :1983 Unknown: 15728
## 35029 : 41 Mean :1980
## 35324 : 41 3rd Qu.:1990
## 33885 : 40 Max. :2003
## (Other):205269
## AWND AWND_ATTRIBUTES PRCP PRCP_ATTRIBUTES
## Min. : 1.120 : 22301 Min. :0.000 ,,W,2400 :186524
## 1st Qu.: 2.910 ,,W:183216 1st Qu.:0.000 T,,W,2400: 18993
## Median : 4.030 Median :0.000
## Mean : 4.385 Mean :0.106
## 3rd Qu.: 5.140 3rd Qu.:0.040
## Max. :12.750 Max. :1.830
## NA's :22301
## SNOW SNOW_ATTRIBUTES SNWD SNWD_ATTRIBUTES
## Min. :0.000 : 545 Min. :0.00000 ,,W,2400 :204127
## 1st Qu.:0.000 ,,W,2400 :201841 1st Qu.:0.00000 T,,W,2400: 1390
## Median :0.000 T,,W,2400: 3131 Median :0.00000
## Mean :0.019 Mean :0.02829
## 3rd Qu.:0.000 3rd Qu.:0.00000
## Max. :4.000 Max. :3.90000
## NA's :545
## TAVG TAVG_ATTRIBUTES TMAX TMAX_ATTRIBUTES TMIN
## Mode:logical Mode:logical Min. :14.00 ,,W:205517 Min. : 2.00
## NA's:205517 NA's:205517 1st Qu.:57.00 1st Qu.:42.00
## Median :71.00 Median :56.00
## Mean :68.17 Mean :53.63
## 3rd Qu.:81.00 3rd Qu.:67.00
## Max. :95.00 Max. :82.00
##
## TMIN_ATTRIBUTES WDF2 WDF2_ATTRIBUTES WDF5
## ,,W:205517 Min. : 10.0 : 22301 Min. : 10.0
## 1st Qu.: 60.0 ,,W:183216 1st Qu.: 70.0
## Median :220.0 Median :220.0
## Mean :182.2 Mean :183.6
## 3rd Qu.:280.0 3rd Qu.:270.0
## Max. :360.0 Max. :360.0
## NA's :22301 NA's :22700
## WDF5_ATTRIBUTES WSF2 WSF2_ATTRIBUTES WSF5
## : 22700 Min. : 6.90 : 22301 Min. :11.00
## ,,W:182817 1st Qu.:10.10 ,,W:183216 1st Qu.:17.00
## Median :12.10 Median :19.90
## Mean :12.81 Mean :20.77
## 3rd Qu.:15.00 3rd Qu.:23.00
## Max. :25.10 Max. :40.90
## NA's :22301 NA's :22700
## WSF5_ATTRIBUTES WT01 WT01_ATTRIBUTES WT02
## : 22700 Min. :1 :123167 Min. :1
## ,,W:182817 1st Qu.:1 ,,W: 82350 1st Qu.:1
## Median :1 Median :1
## Mean :1 Mean :1
## 3rd Qu.:1 3rd Qu.:1
## Max. :1 Max. :1
## NA's :123167 NA's :201690
## WT02_ATTRIBUTES WT03 WT03_ATTRIBUTES WT06
## :201690 Min. :1 :186419 Min. :1
## ,,W: 3827 1st Qu.:1 ,,W: 19098 1st Qu.:1
## Median :1 Median :1
## Mean :1 Mean :1
## 3rd Qu.:1 3rd Qu.:1
## Max. :1 Max. :1
## NA's :186419 NA's :204101
## WT06_ATTRIBUTES WT08 WT08_ATTRIBUTES newStartTime
## :204101 Min. :1 :172801 Min. :2019-01-01 00:35:03
## ,,W: 1416 1st Qu.:1 ,,W: 32716 1st Qu.:2019-05-03 06:30:28
## Median :1 Median :2019-07-18 16:48:45
## Mean :1 Mean :2019-07-12 13:31:09
## 3rd Qu.:1 3rd Qu.:2019-09-23 18:04:58
## Max. :1 Max. :2019-12-31 23:33:21
## NA's :172801
## newStopTime newStartDate newStopDate
## Min. :2019-01-01 00:38:10 Min. :2019-01-01 Min. :2019-01-01
## 1st Qu.:2019-05-03 06:49:10 1st Qu.:2019-05-03 1st Qu.:2019-05-03
## Median :2019-07-18 16:59:56 Median :2019-07-18 Median :2019-07-18
## Mean :2019-07-12 13:47:42 Mean :2019-07-12 Mean :2019-07-12
## 3rd Qu.:2019-09-23 18:18:30 3rd Qu.:2019-09-23 3rd Qu.:2019-09-23
## Max. :2020-01-02 09:26:42 Max. :2020-01-01 Max. :2020-01-02
##
## distanceH speedMetersperMin agegroup startMonth
## Min. : 0.0 Min. :0.000 GenZ : 2173 Min. : 1.00
## 1st Qu.: 825.9 1st Qu.:1.946 Millennial:112458 1st Qu.: 5.00
## Median : 1375.9 Median :2.504 GenX : 67976 Median : 7.00
## Mean : 1779.9 Mean :2.441 Boomer : 22129 Mean : 6.86
## 3rd Qu.: 2305.2 3rd Qu.:3.033 Silent : 701 3rd Qu.: 9.00
## Max. :13812.2 Max. :8.356 VeryOld : 80 Max. :12.00
##
## stopMonth startMonthFactor stopMonthFactor
## Min. : 1.00 9 :24692 9 :24685
## 1st Qu.: 5.00 8 :23606 8 :23609
## Median : 7.00 7 :21700 7 :21698
## Mean : 6.86 10 :21089 10 :21096
## 3rd Qu.: 9.00 6 :21064 6 :21069
## Max. :12.00 5 :19218 5 :19209
## (Other):74148 (Other):74151
# ---- Start date vs trip duration ----
# Every trip, coloured by gender. The largest tripduration values are orders
# of magnitude above the third quartile, so the remaining plots in this
# section keep only trips with tripduration < 10000.
ggplot(masterdata, aes(newStartDate, tripduration, colour = gender)) +
  geom_point()
date_trips <- masterdata[masterdata$tripduration < 10000, ]

# Coloured by gender: points, smoothed trend, violin, boxplot.
date_by_gender <- ggplot(
  date_trips,
  aes(newStartDate, tripduration, colour = gender)
)
date_by_gender + geom_point()
date_by_gender + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
date_by_gender + geom_violin()
## Warning: position_dodge requires non-overlapping x intervals
date_by_gender + geom_boxplot()

# Coloured by usertype: same four views.
date_by_usertype <- ggplot(
  date_trips,
  aes(newStartDate, tripduration, colour = usertype)
)
date_by_usertype + geom_point()
date_by_usertype + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
date_by_usertype + geom_violin()
date_by_usertype + geom_boxplot()
As one might expect, trip duration increases during warmer months and decreases as temperature drops; this suggests that traveling longer distances is either more necessary or more enjoyable in warmer months. Females on average have longer trips than males. Riders of unknown gender have the highest trip durations, and customers have longer trip durations than subscribers. Perhaps customers do not have to reveal their gender information, and perhaps these customers differ from subscribers in ways other than membership status that affect their trip duration. Citibike managers should keep in mind that any system overhauls, construction, or repairs should be scheduled for a month with less demand so the company does not miss out on revenue from peak times.
# Number of trips in each PRCP band: [0, 0.5), [0.5, 1), [1, 1.5), [1.5, Inf).
prcp <- masterdata$PRCP
nrow(masterdata[prcp < 0.5, ])
## [1] 187806
nrow(masterdata[prcp >= 0.5 & prcp < 1, ])
## [1] 13168
nrow(masterdata[prcp >= 1 & prcp < 1.5, ])
## [1] 2712
nrow(masterdata[prcp >= 1.5, ])
## [1] 1831
These numbers will guide the analysis below: while the averages on the y-axis may suggest certain insights, the confidence intervals at various PRCP ranges must also be considered before drawing meaningful conclusions, because far fewer trips occur at high PRCP. As these counts indicate, PRCP has a clear negative association with the number of rides that occur, which suggests that bikers riding in higher PRCP may not be representative of the typical Citibike biker.
# ---- PRCP vs trip duration (trips with tripduration < 10000 only) ----
prcp_duration_trips <- masterdata[masterdata$tripduration < 10000, ]

# Coloured by gender: points, smoothed trend, violin, boxplot.
prcp_duration_by_gender <- ggplot(
  prcp_duration_trips,
  aes(PRCP, tripduration, colour = gender)
)
prcp_duration_by_gender + geom_point()
prcp_duration_by_gender + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
prcp_duration_by_gender + geom_violin()
prcp_duration_by_gender + geom_boxplot()

# Coloured by usertype: same four views.
prcp_duration_by_usertype <- ggplot(
  prcp_duration_trips,
  aes(PRCP, tripduration, colour = usertype)
)
prcp_duration_by_usertype + geom_point()
prcp_duration_by_usertype + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
prcp_duration_by_usertype + geom_violin()
prcp_duration_by_usertype + geom_boxplot()
As precipitation increases, trip duration decreases. Females again have higher average trip duration, but their trip duration has a varied correlation with precipitation. Perhaps the rise/plateau at the high PRCP levels for both males and females is influenced by people who use Citibike out of necessity. This means that the primary decrease in trip duration as PRCP increases is logical, as people who can make their trips shorter will. However, beyond a certain point, the people who cannot adjust their travel will then be bringing up the overall average trip duration. Unknown genders, who may be those who are not regular users of Citibike, are likely casual bikers who will decrease their trip lengths as much as possible, and this is what the visualization depicts. It is curious that customers have an inconsistent correlation with PRCP values. Perhaps we can infer that some rain deters users from taking long trips, while there is a certain amount of rain that is considered pleasant; this certain amount can also be an amount where casual riders do not ride, and so only bikers who bike out of need are biking in the middle range. After this middle range, perhaps even those bikers begin having to compromise on their trip lengths. Biking speed may also fluctuate and be responsible for trip duration changes.
# ---- PRCP vs haversine distance (trips with tripduration < 10000 only) ----
prcp_distance_trips <- masterdata[masterdata$tripduration < 10000, ]

# Coloured by gender: points, smoothed trend, violin, boxplot.
prcp_distance_by_gender <- ggplot(
  prcp_distance_trips,
  aes(PRCP, distanceH, colour = gender)
)
prcp_distance_by_gender + geom_point()
prcp_distance_by_gender + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
prcp_distance_by_gender + geom_violin()
prcp_distance_by_gender + geom_boxplot()

# Coloured by usertype: same four views of PRCP vs distanceH.
prcp_distance_by_usertype <- ggplot(
  prcp_distance_trips,
  aes(PRCP, distanceH, colour = usertype)
)
prcp_distance_by_usertype + geom_point()
prcp_distance_by_usertype + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
prcp_distance_by_usertype + geom_violin()
prcp_distance_by_usertype + geom_boxplot()
Amongst unknown genders, PRCP is associated with a decrease in distance. For males and females, there seems to be a decrease in distance as PRCP increases to a certain level, after which the rate of decrease diminishes. For females, the distance begins to increase, whereas for males it mostly plateaus. This, as seen previously, may be reflective of who is biking in these various PRCP ranges. In the middle range, we can infer that people try to minimize distance if they feasibly can. Perhaps as PRCP becomes drastic, only those with a need to bike will be out, who may not be able to adjust the distance of their trip. The disparity between male response and female response here is curious. Customers, who are likely recreational/infrequent users, predictably decrease distance as PRCP increases. Subscribers reflect a response similar to the females mentioned previously.
# ---- Monthly aggregates (base graphics) ----
# Means are taken over trips, grouped by the month in which the trip started.
averagePRCPMonthly <- with(masterdata, tapply(PRCP, startMonthFactor, mean))
plot(averagePRCPMonthly, xlab = "Month", ylab = "Average PRCP")

averageTripDurationMonthly <- with(
  masterdata,
  tapply(tripduration, startMonthFactor, mean)
)
plot(averageTripDurationMonthly, xlab = "Month", ylab = "Average Trip Duration")

# Trips per start month, then the monthly averages/counts against each other.
numTripsMonthly <- table(masterdata$startMonth)
plot(averagePRCPMonthly, averageTripDurationMonthly)
plot(averagePRCPMonthly, numTripsMonthly)
# ---- PRCP vs trip duration, faceted ----
# NOTE(review): the recorded output shows stat_smooth() failing in some panels
# ("insufficient unique values to support 10 knots"), so those panels have no
# curve; an explicit smoother with a smaller k may be needed - confirm.
month_panels <- facet_wrap(~ startMonthFactor)
facet_duration_trips <- masterdata[masterdata$tripduration < 10000, ]

# All trips as points, one panel per start month.
ggplot(masterdata, aes(PRCP, tripduration)) + geom_point() + month_panels

# Smoothed trends per start month (tripduration < 10000): overall, by
# usertype, by gender.
ggplot(facet_duration_trips, aes(PRCP, tripduration)) +
  geom_smooth() +
  month_panels
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(facet_duration_trips, aes(PRCP, tripduration, colour = usertype)) +
  geom_smooth() +
  month_panels
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(facet_duration_trips, aes(PRCP, tripduration, colour = gender)) +
  geom_smooth() +
  month_panels
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.

# Gender within usertype panels, then usertype within gender panels.
ggplot(facet_duration_trips, aes(PRCP, tripduration, colour = gender)) +
  geom_smooth() +
  facet_wrap(~ usertype)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(facet_duration_trips, aes(PRCP, tripduration, colour = usertype)) +
  geom_smooth() +
  facet_wrap(~ gender)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
It does not appear that months with higher average PRCP correspond to lower average trip durations. This may be due to the rainier months also being warmer and more pleasant than the harsh winters of NY. Perhaps the pleasant days in rainy months are very positive for bikers in general, to the extent that they compensate for rainy days. We can see that, in different months, the amount of PRCP has varied correlations with trip duration. The winter months show little to no change in average trip duration as PRCP increases, which may reflect that bikers who ride during these times are not responsive to PRCP. Customers primarily decrease trip duration as PRCP increases, except in December and June, which may be months where tourists are determined to bike no matter the PRCP; subscribers vary greatly in their responses to PRCP in each month. Similar insights can be drawn when arranging the data by gender and usertype.
# ---- PRCP vs haversine distance, faceted ----
# NOTE(review): as recorded below, stat_smooth() fails in some month panels
# ("insufficient unique values to support 10 knots"), leaving them without a
# curve; an explicit smoother with a smaller k may be needed - confirm.
distance_month_panels <- facet_wrap(~ startMonthFactor)
facet_distance_trips <- masterdata[masterdata$tripduration < 10000, ]

# All trips as points, one panel per start month.
ggplot(masterdata, aes(PRCP, distanceH)) +
  geom_point() +
  distance_month_panels

# Smoothed trends per start month (tripduration < 10000): overall, by
# usertype, by gender.
ggplot(facet_distance_trips, aes(PRCP, distanceH)) +
  geom_smooth() +
  distance_month_panels
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(facet_distance_trips, aes(PRCP, distanceH, colour = usertype)) +
  geom_smooth() +
  distance_month_panels
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggplot(facet_distance_trips, aes(PRCP, distanceH, colour = gender)) +
  geom_smooth() +
  distance_month_panels
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.

# Gender within usertype panels, then usertype within gender panels.
ggplot(facet_distance_trips, aes(PRCP, distanceH, colour = gender)) +
  geom_smooth() +
  facet_wrap(~ usertype)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(facet_distance_trips, aes(PRCP, distanceH, colour = usertype)) +
  geom_smooth() +
  facet_wrap(~ gender)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
In colder months and August, it seems that PRCP does not have a significant correlation with average distance. In other months, the correlation fluctuates or steadily yields lower distance as PRCP increases. Interestingly, April, which is traditionally a very rainy month, seems to have the greatest fluctuation for distance’s correlation with PRCP. Customers seem reliably unaffected by PRCP values in aggregate, except for a few interesting examples in August and May. Subscribers, again, vary greatly in their response, which may suggest that we must look into the behavioral trends of specific users to gain a full picture. While most insights from this data are fundamentally speculative, it is interesting to note the disparity in how females, males, and unknown genders vary in their response to PRCP when separated into usertypes. Female customers seem unbothered, while female subscribers decrease distances up until a certain point and then increase again (potentially due to only necessary rides being made, which are not responsive to PRCP changes). Male customers strongly decrease distance as PRCP increases, while male subscribers reflect a similar pattern to female subscribers (potentially due to the aforementioned insight). Similar insights are yielded by separating user types into genders.
# ---- PRCP vs speed (all trips, no duration filter) ----
# Raw points, then the overall smoothed trend.
prcp_speed <- ggplot(masterdata, aes(PRCP, speedMetersperMin))
prcp_speed + geom_point()
prcp_speed + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Smoothed trend split by gender.
ggplot(masterdata, aes(PRCP, speedMetersperMin, colour = gender)) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Smoothed trend split by usertype, one panel per age group.
# NOTE(review): the recorded warnings show stat_smooth() failing for two
# groups here, presumably the sparsest age groups - confirm.
ggplot(masterdata, aes(PRCP, speedMetersperMin, colour = usertype)) +
  geom_smooth() +
  facet_wrap(~ agegroup)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
PRCP has a generally positive correlation with speed, which may indicate that bikers ride faster in rainier weather. It is important to note certain fluctuations in this correlation. The dip in speed around PRCP=1 may indicate that this amount of rain is particularly difficult to bike in, which causes bikers to slow down.